Set the working directory so that relative data paths resolve correctly.

# NOTE(review): hard-coded user path — prefer a project-relative path
# (e.g. via the here package or RStudio project root) so the script is
# portable across machines.
setwd("/Users/shengwong/Documents/GitHub/Fall2018-Proj1-wsan5277yyyy/output")

# Core tidyverse attaches dplyr, tidyr, readr, stringr, forcats and
# ggplot2, so those packages are not loaded again individually.
library(tidyverse)
library(tidytext)    # unnest_tokens() for word/bigram tokenization
library(DT)          # interactive data tables
library(scales)      # percent_format() for log-axis labels
library(wordcloud2)
library(gridExtra)   # arranging multiple plots
library(ngram)       # wordcount()
library(shiny)
library(janeaustenr)
library(wordcloud)   # wordcloud(); brewer.pal() via RColorBrewer

Step 1 - Load the processed text data along with demographic information on contributors

I use the processed data for our analysis and combine it with the demographic information available.

# Processed happy-moment text produced by the preprocessing step.
hm_data <- read_csv("../output/processed_moments.csv")

# Contributor demographics, fetched directly from the HappyDB repository.
demo_url <- 'https://raw.githubusercontent.com/rit-public/HappyDB/master/happydb/data/demographic.csv'
demo_data <- read_csv(demo_url)

Combine both the data sets and keep the required columns for analysis

I select a subset of the data that satisfies specific row conditions.

# Join happy moments with contributor demographics, keep the analysis
# columns, add a per-moment word count, and restrict to well-populated
# demographic levels.
hm_data <- hm_data %>%
  inner_join(demo_data, by = "wid") %>%
  select(wid,
         original_hm,
         cleaned_hm,
         gender,
         marital,
         parenthood,
         reflection_period,
         age,
         country,
         ground_truth_category,
         text,
         predicted_category) %>%
  # BUG FIX: count words from the current (joined) data's `text` column.
  # The original used `hm_data$text`, i.e. the pre-join object, whose
  # length can differ after inner_join() and would misalign the counts.
  mutate(count = sapply(text, wordcount)) %>%
  filter(gender %in% c("m", "f")) %>%
  filter(marital %in% c("single", "married")) %>%
  filter(parenthood %in% c("n", "y")) %>%
  filter(reflection_period %in% c("24h", "3m")) %>%
  mutate(reflection_period = fct_recode(reflection_period,
                                        months_3 = "3m", hours_24 = "24h"))

Divide hm_data by age group

# Convert age to integer; non-numeric entries become NA and are dropped.
hm_data$age <- as.integer(hm_data$age)
## Warning: NAs introduced by coercion

# BUG FIX: the original `hm_data[-which(is.na(hm_data$age)), ]` drops
# EVERY row when there are no NAs, because -integer(0) selects nothing.
# Logical indexing behaves correctly in both cases.
new_data <- hm_data[!is.na(hm_data$age), ]
new_data$text <- as.character(new_data$text)

# Word count per moment; moments of 50+ words are zeroed out so they do
# not dominate the boxplot below.
new_data$words <- sapply(strsplit(new_data$cleaned_hm, " "), length)
new_data$words <- ifelse(new_data$words < 50, new_data$words, 0)

# Four age groups: 17-22, 23-39, 40-59, 60-100.
group1 <- new_data[which(new_data$age >= 17 & new_data$age <= 22), ]
group1$group <- 1
group2 <- new_data[which(new_data$age >= 23 & new_data$age <= 39), ]
group2$group <- 2
group3 <- new_data[which(new_data$age >= 40 & new_data$age <= 59), ]
group3$group <- 3
group4 <- new_data[which(new_data$age >= 60 & new_data$age <= 100), ]
group4$group <- 4

x <- rbind(group1, group2, group3, group4)
x$group <- factor(x$group, levels = c(1, 2, 3, 4))
#boxplot(words ~ group, data = x, ylab = "words", xlab = "Group")
#barplot(height = table(x$group), names.arg = names(table(x$group)))

# Boxplot of moment length (in words) by age group.
p <- ggplot(x, aes(x = group, y = words, color = group)) +
  geom_boxplot()
p

Part 2: Topic Modeling:

What do they focus on? In this part, I would like to know these four groups' concentrations and focuses. Therefore, I counted the frequency of words in the four groups separately using the tidytext package and tried to find the most frequently used words in each group's happy moments. Below are the word clouds.

# Columns carried into every age-group subset below.
keep_cols <- c("wid", "cleaned_hm", "gender", "age", "text", "count",
               "predicted_category")

# Four labelled age-group subsets used for the word clouds and bigrams.
Young_adult <- new_data %>%
  select(all_of(keep_cols)) %>%
  filter(between(age, 17, 22)) %>%
  mutate(group = "Below 22")

Adult <- new_data %>%
  select(all_of(keep_cols)) %>%
  filter(between(age, 23, 39)) %>%
  mutate(group = "23 to 39")

Middle_age <- new_data %>%
  select(all_of(keep_cols)) %>%
  filter(between(age, 40, 59)) %>%
  mutate(group = "40 to 59")

Senior <- new_data %>%
  select(all_of(keep_cols)) %>%
  filter(between(age, 60, 100)) %>%
  mutate(group = "Above 60")

# Young_adult group: tokenize to single words and plot the 100 most
# frequent ones as a word cloud.
tidy_Young <- Young_adult %>%
  unnest_tokens(word, text)

freq_Young <- tidy_Young %>%
  count(word, sort = TRUE)

wordcloud(freq_Young$word, freq_Young$n,
          scale = c(5, 0.2),
          max.words = 100,
          min.freq = 1,
          random.order = FALSE,
          rot.per = 0.3,
          use.r.layout = TRUE,  # use TRUE/FALSE, never T/F (reassignable)
          random.color = FALSE,
          colors = brewer.pal(9, "Accent"))

# Adult group: same word-cloud treatment as the young-adult group.
tidy_Adult <- Adult %>%
  unnest_tokens(word, text)

freq_Adult <- tidy_Adult %>%
  count(word, sort = TRUE)

wordcloud(freq_Adult$word, freq_Adult$n,
          scale = c(4, 0.2),
          max.words = 100,
          min.freq = 1,
          random.order = FALSE,
          rot.per = 0.3,
          use.r.layout = TRUE,  # use TRUE/FALSE, never T/F (reassignable)
          random.color = FALSE,
          colors = brewer.pal(9, "Accent"))

# Middle_age group: same word-cloud treatment.
tidy_Middle <- Middle_age %>%
  unnest_tokens(word, text)

freq_Middle <- tidy_Middle %>%
  count(word, sort = TRUE)

wordcloud(freq_Middle$word, freq_Middle$n,
          scale = c(4, 0.2),
          max.words = 100,
          min.freq = 1,
          random.order = FALSE,
          rot.per = 0.3,
          use.r.layout = TRUE,  # use TRUE/FALSE, never T/F (reassignable)
          random.color = FALSE,
          colors = brewer.pal(9, "Accent"))

# Senior group: same word-cloud treatment.
tidy_Senior <- Senior %>%
  unnest_tokens(word, text)

freq_Senior <- tidy_Senior %>%
  count(word, sort = TRUE)

wordcloud(freq_Senior$word, freq_Senior$n,
          scale = c(4, 0.2),
          max.words = 100,
          min.freq = 1,
          random.order = FALSE,
          rot.per = 0.3,
          use.r.layout = TRUE,  # use TRUE/FALSE, never T/F (reassignable)
          random.color = FALSE,
          colors = brewer.pal(9, "Accent"))

Word frequencies compare: Comparing the word frequencies of group 1 - 4

# Per-group word proportions, reshaped so each other group's proportion
# can be compared against the "Below 22" baseline column.
frequency <- bind_rows(mutate(tidy_Young, age = "Below 22"),
                       mutate(tidy_Adult, age = "23 to 39"),
                       mutate(tidy_Middle, age = "40 to 59"),
                       mutate(tidy_Senior, age = "Above 60")) %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(age, word) %>%
  group_by(age) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%   # drop grouping before reshaping
  select(-n) %>%
  # pivot_wider()/pivot_longer() replace the superseded spread()/gather().
  pivot_wider(names_from = age, values_from = proportion) %>%
  pivot_longer(c(`23 to 39`, `40 to 59`, `Above 60`),
               names_to = "age", values_to = "proportion")

# Scatter of each group's word proportions against the "Below 22"
# baseline; words near the dashed diagonal are used at similar rates.
ggplot(frequency,
       aes(x = proportion, y = `Below 22`,
           color = abs(`Below 22` - proportion))) +
  geom_abline(color = "gray40", linetype = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001),
                       low = "darkslategray4", high = "gray75") +
  facet_wrap(~ age, ncol = 3) +
  labs(y = "Below 22", x = NULL) +
  theme(legend.position = "none")

# Lower bound of the 95% CI for the correlation between each group's
# word proportions and the "Below 22" baseline.
cor_lower_vs_young <- function(group_label) {
  cor.test(data = frequency[frequency$age == group_label, ],
           ~ proportion + `Below 22`)$conf.int[1]
}
cor_lower_vs_young("23 to 39")
## [1] 0.9497406
cor_lower_vs_young("40 to 59")
## [1] 0.8307822
cor_lower_vs_young("Above 60")
## [1] 0.7553508

Words that are close to the line in these plots have similar frequencies in the two groups being compared — common words such as those at the upper-frequency end of every panel. Words that are far from the line appear much more often in one age group's happy moments than in the other's.

Notice also that the panels differ in how tightly the words hug the diagonal and in how far the point clouds extend into low frequencies: a tighter, fuller panel indicates that the two groups use more similar vocabularies, while scatter and empty space at low frequency indicate more divergent word use. Not every word appears in every group's moments, so the panels contain different numbers of points. Consistent with this, the correlation results below show that the "Below 22" group is most similar to the adjacent "23 to 39" group and least similar to the "Above 60" group.

Let's quantify how similar and different these sets of word frequencies are using a correlation test. How correlated are the word frequencies between each pair of age groups?

# Same reshaping, now with "23 to 39" as the baseline column and the two
# older groups in long form.
frequency <- bind_rows(mutate(tidy_Adult, age = "23 to 39"),
                       mutate(tidy_Middle, age = "40 to 59"),
                       mutate(tidy_Senior, age = "Above 60")) %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(age, word) %>%
  group_by(age) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%   # drop grouping before reshaping
  select(-n) %>%
  # pivot_wider()/pivot_longer() replace the superseded spread()/gather().
  pivot_wider(names_from = age, values_from = proportion) %>%
  pivot_longer(c(`40 to 59`, `Above 60`),
               names_to = "age", values_to = "proportion")

# Same comparison plot, with "23 to 39" as the baseline group.
ggplot(frequency,
       aes(x = proportion, y = `23 to 39`,
           color = abs(`23 to 39` - proportion))) +
  geom_abline(color = "gray40", linetype = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001),
                       low = "darkslategray4", high = "gray75") +
  facet_wrap(~ age, ncol = 2) +
  labs(y = "23 to 39", x = NULL) +
  theme(legend.position = "none")
## Warning: Removed 27485 rows containing missing values (geom_point).
## Warning: Removed 27485 rows containing missing values (geom_text).

# CI lower bounds for the correlation with the "23 to 39" group.
cor.test(~ proportion + `23 to 39`,
         data = subset(frequency, age == "40 to 59"))$conf.int[1]
## [1] 0.9438877
cor.test(~ proportion + `23 to 39`,
         data = subset(frequency, age == "Above 60"))$conf.int[1]
## [1] 0.8709213
# Final pairing: "40 to 59" as baseline against "Above 60".
frequency <- bind_rows(mutate(tidy_Middle, age = "40 to 59"),
                       mutate(tidy_Senior, age = "Above 60")) %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(age, word) %>%
  group_by(age) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%   # drop grouping before reshaping
  select(-n) %>%
  # pivot_wider()/pivot_longer() replace the superseded spread()/gather().
  pivot_wider(names_from = age, values_from = proportion) %>%
  pivot_longer(`Above 60`, names_to = "age", values_to = "proportion")

# Comparison plot for the two oldest groups (single panel, no facets).
ggplot(frequency,
       aes(x = proportion, y = `40 to 59`,
           color = abs(`40 to 59` - proportion))) +
  geom_abline(color = "gray40", linetype = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001),
                       low = "darkslategray4", high = "gray75") +
  labs(y = "40 to 59", x = NULL) +
  theme(legend.position = "none")
## Warning: Removed 5363 rows containing missing values (geom_point).
## Warning: Removed 5363 rows containing missing values (geom_text).

# CI lower bound for the correlation between the two oldest groups.
cor.test(~ proportion + `40 to 59`,
         data = subset(frequency, age == "Above 60"))$conf.int[1]
## [1] 0.9319521

Relationships between words: n-grams

#library(dplyr)
#library(tidytext)
#library(janeaustenr)
#library(tidyr)

# Bigrams for the youngest group; one-word moments (count == 1) cannot
# produce a bigram and are filtered out first.
Young_bigrams <- Young_adult %>%
  filter(count != 1) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

# Count bigrams by their component words.
Young_counts <- Young_bigrams %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  count(word1, word2, sort = TRUE)

# Re-join the words and tag the group label.
Young_united <- Young_counts %>%
  unite(bigram, word1, word2, sep = " ") %>%
  mutate(group = "Below 22")


# Bigrams for the 23-39 group (same treatment as above).
Adult_bigrams <- Adult %>%
  filter(count != 1) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

Adult_counts <- Adult_bigrams %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  count(word1, word2, sort = TRUE)

Adult_united <- Adult_counts %>%
  unite(bigram, word1, word2, sep = " ") %>%
  mutate(group = "23 to 39")

# Bigrams for the 40-59 group (same treatment as above).
Middle_bigrams <- Middle_age %>%
  filter(count != 1) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

Middle_counts <- Middle_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  count(word1, word2, sort = TRUE)

Middle_united <- Middle_counts %>%
  unite(bigram, word1, word2, sep = " ")
# CONSISTENCY FIX: label was "40-59"; every other use of this group
# (subsets, frequency tables, plots) spells it "40 to 59", and these
# labels are rbind-ed into one table below.
Middle_united$group <- "40 to 59"

# Bigrams for the 60+ group (same treatment as above).
Senior_bigrams <- Senior %>%
  filter(count != 1) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

Senior_counts <- Senior_bigrams %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  count(word1, word2, sort = TRUE)

Senior_united <- Senior_counts %>%
  unite(bigram, word1, word2, sep = " ") %>%
  mutate(group = "Above 60")

# Combined bigram counts across all four age groups.
# NOTE(review): `age_bigrams` is not referenced later in this file —
# confirm a downstream chunk uses it, or drop it.
age_bigrams <- rbind(Young_united,Adult_united,Middle_united,Senior_united)

# Top 15 most frequent bigrams for the youngest group.
Young_united %>%
  arrange(desc(n)) %>%
  mutate(word = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(group) %>%
  # BUG FIX: rank explicitly by the count column. Bare top_n(15)
  # silently selects by the LAST column (`word`, a factor) and only
  # happened to work because of the level ordering above.
  top_n(15, n) %>%
  ungroup() %>%
  ggplot(aes(word, n, fill = group)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL) +
  facet_wrap(~group, ncol = 2, scales = "free") +
  coord_flip()

# Top 15 most frequent bigrams for the 23-39 group.
Adult_united %>%
  arrange(desc(n)) %>%
  mutate(word = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(group) %>%
  # BUG FIX: rank by `n` explicitly (bare top_n(15) selects by the last
  # column, the `word` factor).
  top_n(15, n) %>%
  ungroup() %>%
  ggplot(aes(word, n, fill = group)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL) +
  facet_wrap(~group, ncol = 2, scales = "free") +
  coord_flip()

# Top 15 most frequent bigrams for the 40-59 group.
Middle_united %>%
  arrange(desc(n)) %>%
  mutate(word = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(group) %>%
  # BUG FIX: rank by `n` explicitly (bare top_n(15) selects by the last
  # column, the `word` factor).
  top_n(15, n) %>%
  ungroup() %>%
  ggplot(aes(word, n, fill = group)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL) +
  facet_wrap(~group, ncol = 2, scales = "free") +
  coord_flip()

# Top 15 most frequent bigrams for the 60+ group.
Senior_united %>%
  arrange(desc(n)) %>%
  mutate(word = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(group) %>%
  # BUG FIX: rank by `n` explicitly (bare top_n(15) selects by the last
  # column, the `word` factor).
  top_n(15, n) %>%
  ungroup() %>%
  ggplot(aes(word, n, fill = group)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL) +
  facet_wrap(~group, ncol = 2, scales = "free") +
  coord_flip()

The bar charts above show the most distinctive two-word phrases within each age group's happy moments.

There are advantages and disadvantages to examining bigrams rather than individual words. Pairs of consecutive words can capture structure that isn't present when one is just counting single words, and may provide context that makes tokens more understandable. However, per-bigram counts are also sparser: a typical two-word pair is rarer than either of its component words. Thus, bigrams can be especially useful when you have a very large text dataset, as we do here.

Heatmap of the predicted category (7 categories) by age group, with dendrograms.

# Frequency of each predicted happiness category within each age group.
x <- table(Young_adult$predicted_category)
y <- table(Adult$predicted_category)
z <- table(Middle_age$predicted_category)
w <- table(Senior$predicted_category)

# TYPO FIX: "boonding" -> "bonding".
# NOTE(review): these labels assume every group's table contains the
# same seven categories in the same (alphabetical) order — confirm no
# group is missing a category, or the columns will be mislabelled.
coln <- c("achievement", "affection", "bonding", "enjoy_the_moment",
          "exercise", "leisure", "nature")
rown <- c("Below 22", "23 to 39", "40 to 59", "Above 60")
freq_category <- matrix(c(x, y, z, w), nrow = 4, byrow = TRUE)
rownames(freq_category) <- rown
colnames(freq_category) <- coln
#freq_category
# legend (Sunrise)
heatmap(freq_category, cexRow = 1, cexCol = 0.6)